from google.colab import files
files.upload()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from numpy import where
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_recall_curve, auc, roc_auc_score, roc_curve, recall_score, classification_report
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SVMSMOTE
from lightgbm import LGBMClassifier
from sklearn import metrics
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
# !pip install pycaret
# import pycaret
# from pycaret.utils import enable_colab
# enable_colab()
!pwd
# Load the task dataset from the Colab filesystem and take a first look.
df = pd.read_csv("/content/task_data.csv")
df.head()   # preview the first rows
df.info()   # dtypes and non-null counts per column
df.columns
# Every column except the last one ('y', the target) is a predictor.
features = df.columns[:-1]
features
'''The Spearman correlation evaluates the monotonic relationship between two continuous or ordinal variables.
In a monotonic relationship, the variables tend to change together, but not necessarily at a constant rate.
The Spearman correlation coefficient is based on the ranked values for each variable rather than the raw data.'''
# BUG FIX: the plot is titled "Spearman correlation" but DataFrame.corr()
# defaults to Pearson; pass method='spearman' so the heatmap matches its title.
plt.figure(figsize=(16, 16))
heatmap = sns.heatmap(np.round(df[features].corr(method='spearman'), 3), vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Features correlation', fontdict={'fontsize':10}, pad=10)
plt.title("Spearman correlation - train data")
plt.show()
# See the class imbalance in the target.
# FIX: pd.value_counts(...) was deprecated and removed in pandas 2.x;
# call .value_counts() on the Series instead.
df['y'].value_counts().plot.bar()
plt.title('Class histogram')
plt.xlabel('Class')
plt.ylabel('Frequency')
df['y'].value_counts()
df.head()
# Remove string suffixes so these columns can be parsed as numbers.
df['C'] = df['C'].str[:-5]   # drop a 5-character suffix
df['D'] = df['D'].str[:-1]   # drop a trailing character
df['T'] = df['T'].str[:-1]
# Convert to Numeric
for col in ['C', 'D', 'T']:
    df[col] = pd.to_numeric(df[col])
# (Alternative categorical encoding, kept for reference.)
# for col_name in df.columns:
#     if(df[col_name].dtype == 'object'):
#         df[col_name]= df[col_name].astype('category')
#         df[col_name] = df[col_name].cat.codes
# Normalize: z-score each listed feature with a fresh per-column scaler.
from sklearn.preprocessing import StandardScaler, RobustScaler
features_to_scale = ['A', 'B', 'E', 'L', 'N', 'O', 'Q', 'S', 'T', 'V', 'W']
for col in features_to_scale:
    df[col] = StandardScaler().fit_transform((df[col].values.reshape(-1, 1)))
# RobustScaler is less prone to outliers.
# NOTE(review): 'B' is standard-scaled above AND robust-scaled again below —
# the double scaling looks unintended; confirm which scaler 'B' should use.
std_scaler = StandardScaler()   # NOTE(review): unused — only rob_scaler is applied below
rob_scaler = RobustScaler()
for col in ['B', 'D','I','U']:
    df[col] = rob_scaler.fit_transform(df[col].values.reshape(-1,1))
df.head()
# Label-encode low-cardinality string categoricals to integer codes.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in ['H', 'J', 'K']:
    le.fit(df[col])
    df[col] = le.transform(df[col])
df.head()
# Remaining object columns: pandas category codes (NaN becomes -1).
for col in ['C', 'F', 'G', 'M', 'P','R', 'X']:
    df[col] = df[col].astype('category').cat.codes
df
# Column groupings used by the plotting helpers below.
continuous = ['A', 'B','D','E', 'I', 'L', 'N', 'O', 'Q', 'S','T', 'U', 'V', 'W']
cat = ['C', 'F', 'G', 'H', 'J', 'K', 'M', 'P', 'R', 'X']
def plot_feature_distribution(df1, features):
    """Plot per-feature distributions from *df1* on a 5x3 grid.

    Parameters
    ----------
    df1 : pandas.DataFrame holding the columns named in *features*.
    features : iterable of column names (at most 15 fit the 5x3 grid).
    """
    i = 0
    sns.set_style('whitegrid')
    plt.figure()
    fig, ax = plt.subplots(5, 3, figsize=(14, 24))
    for feature in features:
        i += 1
        plt.subplot(5, 3, i)
        # BUG FIX: the original read the global `df`, silently ignoring the
        # `df1` argument; use the parameter that was passed in.
        sns.distplot(df1[feature], color="orange", kde=True, bins=60, label='train')
        plt.xlabel(feature, fontsize=9); plt.legend()
    plt.show()
# Numerical predictors: one histogram per continuous column, two per row.
numerical_predictors = continuous
from plotly.subplots import make_subplots
import plotly.graph_objects as go
fig = make_subplots(rows=15, cols=2, subplot_titles=numerical_predictors)
for i, col in enumerate(continuous):
    fig.add_trace(
        go.Histogram(x=df[col]),
        i//2 + 1, i%2 + 1)
fig.update_layout(
    title_text='Numerical Predictors Distribution',
    height=1200,
    showlegend=False)
# Alternative imputation strategies, kept for reference:
# df = df.fillna(-999)
# from sklearn.impute import SimpleImputer
# df = SimpleImputer(missing_value = np.nan, strategy = 'mean').fit_transform(df)
# df_imp = df.copy()
df.isna().sum()
# Rows with any missing value are dropped instead of imputed.
df = df.dropna()
# df
# df_m = df.copy()
# df_bfm = df.copy()
# df = df_m.copy()
# Determine the classifiers we are going to use and decide which one performs
# best. Then create a neural network and compare it to our best classifier.
# Note: we don't use accuracy as a metric on imbalanced datasets (it will
# usually be high and misleading); use the F1-score, precision/recall, or a
# confusion matrix instead.
# Work on a copy so outlier experiments below don't mutate df.
new_df = df.copy()
import seaborn as sns
from scipy.stats import norm
# Distributions of A, B and D for the positive class (y == 1),
# each overlaid with a fitted normal curve.
f, (ax1, ax2, ax3) = plt.subplots(1,3, figsize=(20, 6))
v14_dist = new_df['A'].loc[new_df['y'] == 1].values
sns.distplot(v14_dist,ax=ax1, fit=norm, color='#FB8861')
ax1.set_title('A Distribution \n (Distribution)', fontsize=14)
v12_dist = new_df['B'].loc[new_df['y'] == 1].values
sns.distplot(v12_dist,ax=ax2, fit=norm, color='#56F9BB')
ax2.set_title('B Distribution \n ( Distribution)', fontsize=14)
v10_dist = new_df['D'].loc[new_df['y'] == 1].values
sns.distplot(v10_dist,ax=ax3, fit=norm, color='#C5B3F9')
ax3.set_title('D Distribution \n ( Distribution)', fontsize=14)
plt.show()
# -----> Outlier detection for feature A (highest negative correlation with
# the label) using the 1.5*IQR rule on the positive class only.
# BUG FIX: the original print messages referred to V14/V10 — variable names
# copied from another dataset — while the feature analysed here is 'A'.
a_pos = new_df['A'].loc[new_df['y'] == 1].values
q25, q75 = np.percentile(a_pos, 25), np.percentile(a_pos, 75)
print('Quartile 25: {} | Quartile 75: {}'.format(q25, q75))
a_iqr = q75 - q25
print('iqr: {}'.format(a_iqr))
a_cut_off = a_iqr * 1.5
a_lower, a_upper = q25 - a_cut_off, q75 + a_cut_off
print('Cut Off: {}'.format(a_cut_off))
print('A Lower: {}'.format(a_lower))
print('A Upper: {}'.format(a_upper))
outliers = [x for x in a_pos if x < a_lower or x > a_upper]
print('Feature A Outliers for Cases: {}'.format(len(outliers)))
print('A outliers:{}'.format(outliers))
# Outlier removal kept disabled, as in the original:
# new_df = new_df.drop(new_df[(new_df['A'] > a_upper) | (new_df['A'] < a_lower)].index)
# print('----' * 44)
df.columns
import matplotlib.gridspec as gridspec
# Class-conditional histograms: one row per continuous feature,
# overlaying the y==1 and y==0 distributions.
plt.figure(figsize=(12,28*4))
gs = gridspec.GridSpec(28, 1)
for i, cn in enumerate(continuous):
    ax = plt.subplot(gs[i])
    sns.distplot(df[cn][df.y == 1], bins=50)
    sns.distplot(df[cn][df.y == 0], bins=50)
    ax.set_xlabel('')
    ax.set_title('histogram of feature: ' + str(cn))
plt.show()
# Distribution plots with plotly.
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
# Target distribution: histogram plus box plot, sharing the x axis.
fig = make_subplots(rows=2, cols=1, shared_xaxes=True)
fig.append_trace(
    go.Histogram(x=df['y']),
    1, 1)
fig.append_trace(
    go.Box(x=df['y'], name='y'),
    2, 1)
fig.update_layout(title_text='Target Distribution', showlegend=False)
# Numerical predictors, two histograms per row.
numerical_predictors = continuous
fig = make_subplots(rows=20, cols=2, subplot_titles=numerical_predictors)
for i, col in enumerate(numerical_predictors):
    fig.add_trace(
        go.Histogram(x=df[col]),
        i//2 + 1, i%2 + 1)
fig.update_layout(
    title_text='Numerical Predictors Distribution',
    height=1200,
    showlegend=False)
# Categorical predictors.
# categorical_predictors = [p for p in predictors if 'cat' in p]
categorical_predictors = cat
fig = make_subplots(rows=5, cols=2, subplot_titles=categorical_predictors)
for i, col in enumerate(categorical_predictors):
    fig.add_trace(
        go.Histogram(x=df[col]),
        i//2 + 1, i%2 + 1)
fig.update_layout(
    title_text='Categorical Predictors Distribution',
    height=1000,
    showlegend=False)
# PCA on a standardized copy of the predictors, for 2-D visualization.
ex_df = df.copy()
# ex_df = ex_df.fillna(-999)
# Scaled Data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(ex_df.drop('y',axis=1))
scaled_data = scaler.transform(ex_df.drop('y',axis=1))
# Dimensionality Reduction and Clustering:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(scaled_data)
x_pca = pca.transform(scaled_data)
x_pca.shape
# Scatter of the first two principal components, coloured by class.
plt.figure(figsize=(8,6))
plt.scatter(x_pca[:,0],x_pca[:,1],c=ex_df['y'])
plt.xlabel('First principal component')
plt.ylabel('Second Principal Component')
# Build the (X, y) arrays and oversample the minority class with SMOTE.
X = np.array(df.drop(['y'], axis=1))
y = np.array(df['y'])
print('Shape of X: {}'.format(X.shape))
print('Shape of y: {}'.format(y.shape))
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
# Split first, then oversample ONLY the training fold, so the test set
# keeps its original class distribution.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)
print("Before OverSampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train==0)))
sm = SMOTE(random_state=2)
# BUG FIX: fit_sample() was removed in imbalanced-learn 0.8; fit_resample()
# is the supported API and behaves identically.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel())
print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res==0)))
# !pip install pycaret
# import pycaret
# from pycaret.utils import enable_colab
# enable_colab()
# Grid-search the inverse regularization strength C for logistic regression
# over the SMOTE-resampled training data (5-fold CV).
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_recall_curve, auc, roc_auc_score, roc_curve, recall_score, classification_report
parameters = { 'C': np.linspace(1, 10, 10) }
lr = LogisticRegression()
clf = GridSearchCV(lr, parameters, cv=5, verbose=5, n_jobs=3)
clf.fit(X_train_res, y_train_res.ravel())
clf.best_params_
# FIX: refit with the C found by the grid search instead of a hard-coded
# C=6, which silently ignored the search result.
lr1 = LogisticRegression(C=clf.best_params_['C'], penalty='l2', verbose=5)
lr1.fit(X_train_res, y_train_res.ravel())
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix *cm* with *classes* as tick labels.
    Normalization can be applied by setting `normalize=True`.

    Note: this definition shadows the (removed in sklearn 1.2) import of
    sklearn.metrics.plot_confusion_matrix at the top of the file.
    """
    # BUG FIX: normalize *before* drawing — the original normalized after
    # plt.imshow(), so the image always showed raw counts even when
    # normalize=True (only the text annotations were normalized).
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)
    # Annotate each cell; switch text colour for contrast against the fill.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# Evaluate the tuned logistic regression on the (resampled) training data.
y_train_pre = lr1.predict(X_train_res)
cnf_matrix_tra = confusion_matrix(y_train_res, y_train_pre)
# Recall = TP / (TP + FN), read off the second row of the confusion matrix.
print("Recall metric in the train dataset: {}%".format(100*cnf_matrix_tra[1,1]/(cnf_matrix_tra[1,0]+cnf_matrix_tra[1,1])))
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix_tra , classes=class_names, title='Confusion matrix')
plt.show()
# Same evaluation on the untouched test split.
y_pre = lr1.predict(X_test)
cnf_matrix = confusion_matrix(y_test, y_pre)
print("Recall metric in the testing dataset: {}%".format(100*cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1])))
#print("Precision metric in the testing dataset: {}%".format(100*cnf_matrix[0,0]/(cnf_matrix[0,0]+cnf_matrix[1,0])))
# Plot non-normalized confusion matrix
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix , classes=class_names, title='Confusion matrix')
plt.show()
# ROC curve built from decision-function scores (not hard 0/1 predictions).
tmp = lr1.fit(X_train_res, y_train_res.ravel())
y_pred_sample_score = tmp.decision_function(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_sample_score)
roc_auc = auc(fpr,tpr)
# Plot ROC
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b',label='AUC = %0.3f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.0])
plt.ylim([-0.1,1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
print("ROC AUC score: ", roc_auc)
# style.use('ggplot')
# Lower-triangle correlation heatmap (upper triangle masked out).
sns.set_style('whitegrid')
plt.subplots(figsize = (30,30))
## Plotting heatmap. Generate a mask for the upper triangle (taken from seaborn example gallery)
# FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24; the builtin
# bool is the documented replacement and is what seaborn expects for masks.
mask = np.zeros_like(df.corr(), dtype=bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(df.corr(), cmap=sns.diverging_palette(20, 220, n=200), annot=True, mask=mask, center = 0, );
plt.title("Heatmap of all the Features of Train data set", fontsize = 25);
#visualizing the features w high negative correlation
f, axes = plt.subplots(nrows=3, ncols=3, figsize=(25,15))
f.suptitle('Features With High Negative Correlation', size=35)
sns.boxplot(x="y", y="A", data=df, ax=axes[0,0])
sns.boxplot(x="y", y="B", data=df, ax=axes[0,1])
sns.boxplot(x="y", y="C", data=df, ax=axes[0,2])
sns.boxplot(x="y", y="D", data=df, ax=axes[1,0])
sns.boxplot(x="y", y="F", data=df, ax=axes[1,1])
sns.boxplot(x="y", y="G", data=df, ax=axes[1,2])
sns.boxplot(x="y", y="H", data=df, ax=axes[2,0])
sns.boxplot(x="y", y="I", data=df, ax=axes[2,1])
# BUG FIX: 'U' was drawn on axes[2,1] as well, plotting over the 'I' boxplot
# while axes[2,2] sat empty and was then deleted; place 'U' on axes[2,2].
sns.boxplot(x="y", y="U", data=df, ax=axes[2,2])
#visualizing the features w high positive correlation
f, axes = plt.subplots(nrows=1, ncols=5, figsize=(14,5))
f.suptitle('Features With High Positive Correlation', size=20)
sns.boxplot(x="y", y="N", data=df, ax=axes[0])
sns.boxplot(x="y", y="O", data=df, ax=axes[1])
sns.boxplot(x="y", y="V", data=df, ax=axes[2])
sns.boxplot(x="y", y="W", data=df, ax=axes[3])
sns.boxplot(x="y", y="U", data=df, ax=axes[4])
def plot_feature_distribution(df1, features):
    """Plot per-feature distributions from *df1* on a 5x3 grid.

    Note: this redefines the identical helper declared earlier in the file.

    Parameters
    ----------
    df1 : pandas.DataFrame holding the columns named in *features*.
    features : iterable of column names (at most 15 fit the 5x3 grid).
    """
    i = 0
    sns.set_style('whitegrid')
    plt.figure()
    fig, ax = plt.subplots(5, 3, figsize=(14, 24))
    for feature in features:
        i += 1
        plt.subplot(5, 3, i)
        # BUG FIX: the original read the global `df`, silently ignoring the
        # `df1` argument; use the parameter that was passed in.
        sns.distplot(df1[feature], color="orange", kde=True, bins=60, label='train')
        plt.xlabel(feature, fontsize=9); plt.legend()
    plt.show()
# Draw the distribution grid for all continuous features.
plot_feature_distribution(df,continuous)
# Plot the categorical features.
# plt.style.use("ggplot")
# Count plot for each categorical feature on a 2x5 grid.
plt.figure(figsize=(25,20))
for i,feature in enumerate(cat):
    plt.subplot(2,5,i+1)
    sns.countplot(df[feature])
def distribution3(feature, category, df=df):
    """Histogram of *feature*, with bars coloured by the levels of *category*.

    *df* defaults to the module-level DataFrame captured at definition time.
    """
    plt.subplots(figsize=(15, 7))
    sns.histplot(df, x=feature, hue=category)
distribution3('D','H')
def boxploting1(feature, category, df=df, figure_size=(15,7)):
    """Draw a box plot of *feature* against *category* from *df*.

    Whiskers span the full data range (whis=[0, 100]), so no points are
    rendered as fliers. *figure_size* controls the figure dimensions.
    """
    fig, ax = plt.subplots(figsize=figure_size)
    sns.boxplot(x=feature, y=category, data=df, whis=[0, 100],
                width=.6, palette="vlag", ax=ax)
# Candidate models to compare: LogisticRegression, LinearDiscriminantAnalysis,
# KNeighborsClassifier, RandomForestClassifier, DecisionTreeClassifier,
# XGBClassifier, GaussianNB, GradientBoostingClassifier, LGBMClassifier.
def Definedata():
    """Return (X, y): feature matrix and target vector from the global df."""
    feature_matrix = df.drop(columns=['y']).values
    target = df['y'].values
    return feature_matrix, target
def SMOTE():
    """Oversample the minority class with SMOTE, then split 50/50.

    Prints the class distribution before and after resampling, shows a
    scatter plot of the first two features coloured by class, and returns
    (X_train, X_test, y_train, y_test).

    NOTE(review): this function name shadows the module-level import of the
    imblearn SMOTE class, so the sampler is imported under a local alias.
    """
    from collections import Counter
    from sklearn.model_selection import train_test_split
    from imblearn.over_sampling import SMOTE as SMOTESampler
    from matplotlib import pyplot
    from numpy import where
    X, y = Definedata()
    # summarize the original class distribution
    counter = Counter(y)
    print(counter)
    # transform the dataset
    smt = SMOTESampler(random_state=0)
    # BUG FIX: fit_sample() was removed in imbalanced-learn 0.8;
    # fit_resample() is the supported equivalent.
    X, y = smt.fit_resample(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=2)
    # summarize the new class distribution
    counter = Counter(y)
    print(counter)
    # scatter plot of examples by class label
    for label, _ in counter.items():
        row_ix = where(y == label)[0]
        pyplot.scatter(X[row_ix, 0], X[row_ix, 1], label=str(label))
    pyplot.legend()
    pyplot.show()
    return X_train, X_test, y_train, y_test
# SMOTE()
# SMOTE() = (X_train1, X_test1, y_train1, y_test1)
def Models(models, X_train, X_test, y_train, y_test, title):
    """Fit *models* and show train/test/full-dataset confusion matrices.

    Each heatmap's x-label reports the recall on that split; the full
    dataset (X, y) is re-read from the global df via Definedata().
    Returns (y, predictions on the full dataset).
    """
    model = models
    model.fit(X_train,y_train)
    X, y = Definedata()
    train_matrix = pd.crosstab(y_train, model.predict(X_train), rownames=['Actual'], colnames=['Predicted'])
    test_matrix = pd.crosstab(y_test, model.predict(X_test), rownames=['Actual'], colnames=['Predicted'])
    matrix = pd.crosstab(y, model.predict(X), rownames=['Actual'], colnames=['Predicted'])
    f,(ax1,ax2,ax3) = plt.subplots(1,3,sharey=True, figsize=(20, 3))
    #f = plt.figure(figsize=(20, 3))
    # BUG FIX: recall_score expects (y_true, y_pred); the original passed the
    # predictions first, which actually computes precision, not recall.
    g1 = sns.heatmap(train_matrix, annot=True, fmt=".1f", cbar=False,annot_kws={"size": 16},ax=ax1)
    g1.set_title(title)
    g1.set_ylabel('Total = {}'.format(y_train.sum()), fontsize=14, rotation=90)
    g1.set_xlabel('Recall Accuracy score for Trainingset: {}'.format(recall_score(y_train, model.predict(X_train))))
    g2 = sns.heatmap(test_matrix, annot=True, fmt=".1f",cbar=False,annot_kws={"size": 16},ax=ax2)
    g2.set_ylabel('Total = {}'.format(y_test.sum()), fontsize=14, rotation=90)
    g2.set_xlabel('Recall Accuracy score for Testingset: {}'.format(recall_score(y_test, model.predict(X_test))))
    g3 = sns.heatmap(matrix, annot=True, fmt=".1f",cbar=False,annot_kws={"size": 16},ax=ax3)
    g3.set_ylabel('Total = {}'.format(y.sum()), fontsize=14, rotation=90)
    g3.set_xlabel('Recall Accuracy score for Totalset: {}'.format(recall_score(y, model.predict(X))))
    plt.show()
    return y, model.predict(X)
def Featureimportances(models, X_train, y_train):
    """Fit *models* and return a DataFrame of per-feature importances.

    The frame has columns 'Features', 'Importance' and running
    'Sum Importance', sorted ascending by importance.
    """
    model = models
    model.fit(X_train,y_train)
    importances = model.feature_importances_
    # FIX: the original hard-coded df.columns[:24]; derive the predictor
    # count from the importances so the labels stay aligned with the model
    # even if the column set changes.
    features = df.columns[:len(importances)]
    imp = pd.DataFrame({'Features': features, 'Importance': importances})
    # NOTE(review): the cumulative sum is taken in original column order,
    # *before* sorting — confirm whether a cumsum over sorted importances
    # was intended.
    imp['Sum Importance'] = imp['Importance'].cumsum()
    imp = imp.sort_values(by = 'Importance')
    return imp
# Benchmark two baseline models on the SMOTE-resampled training set.
title = 'LogisticRegression/SMOTE'
%time Models(LogisticRegression(),X_train_res, X_test, y_train_res, y_test, title)
title = 'GradientBoostingClassifier/SMOTE'
%time Models(GradientBoostingClassifier(n_estimators=500, learning_rate=1, max_features=2, max_depth=2, random_state=0),X_train_res, X_test, y_train_res, y_test, title)
%time Featureimportances(GradientBoostingClassifier(n_estimators=500, learning_rate=1, max_features=2, max_depth=2, random_state=0),X_train_res, y_train_res)
imp = %time Featureimportances(GradientBoostingClassifier(n_estimators=500, learning_rate=1, max_features=2, max_depth=2, random_state=0),X_train_res, y_train_res)
# Bar chart of the gradient-boosting feature importances, highest first.
tmp = pd.DataFrame({'Feature': features, 'Feature importance': imp['Importance']})
tmp = tmp.sort_values(by='Feature importance',ascending=False)
plt.figure(figsize = (7,4))
plt.title('Features importance',fontsize=14)
s = sns.barplot(x='Feature',y='Feature importance',data=tmp)
s.set_xticklabels(s.get_xticklabels(),rotation=90)
plt.show()
If the prediction set will also have feature X, then keeping X in the training set wouldn't be a problem in terms of data leakage. Otherwise, it would obviously be problematic to rely on this model's performance. In the first case, I would probably go for stratified k-fold cross-validation with a pipeline build.
# Baseline neural network: single-hidden-layer MLP, evaluated by recall.
from sklearn.metrics import recall_score
from sklearn.neural_network import MLPClassifier
MLPC = MLPClassifier(hidden_layer_sizes=(200,), max_iter=10000)
# NOTE(review): trained on the original (imbalanced) split, not on
# X_train_res — confirm whether the resampled data was intended here.
MLPC.fit(X_train, y_train)
y_pred = MLPC.predict(X_test)
recall_acc = recall_score (y_test,y_pred)
recall_acc
I have other methods left to test, such as TabNet, conversion of the data to 3D for CNNs, an optimized NN model, and a few more. The score can be further improved to 96+ with more effort on feature engineering and optimized modelling.
# Inspect the strongest pairwise correlations.
df['B'].corr(df['I'])
df.corr().unstack().sort_values(ascending=False).drop_duplicates().head(10)
# Mask everything below 0.8 so only strong correlations remain visible.
corr = df.corr()
kot = corr[corr >= 0.8]
plt.figure(figsize=(12, 8))
sns.heatmap(kot, cmap='Greens')
Correlation describes how one feature corresponds to another, with the goal of predicting the output together with the other variables. For example, for datasets whose features have a linear relationship, we can use a linear model to correctly capture that relationship (while avoiding multicollinearity), without needing any deeper models. It is also always better to choose the most contributing features rather than simply more features (PCA and t-SNE are used for dimensionality reduction and for visually inspecting relations), and thus avoid adding unnecessary noise that can mislead the model.
The features are already normalized in any case. Obviously, for linear models we need to normalize, while tree-based models are insensitive to non-normal distributions. We can use several normalization techniques to scale and normalize data, such as scaling, clipping, and log scaling.
import matplotlib.gridspec as gridspec
# Class-conditional histogram for feature U only (y==1 vs y==0 overlaid).
plt.figure(figsize=(12,28*4))
gs = gridspec.GridSpec(28, 1)
for i, cn in enumerate(['U']):
    ax = plt.subplot(gs[i])
    sns.distplot(df[cn][df.y == 1], bins=50)
    sns.distplot(df[cn][df.y == 0], bins=50)
    ax.set_xlabel('')
    ax.set_title('histogram of feature: ' + str(cn))
plt.show()
def plot_feature_distribution(df1, features):
    """Plot a distribution for each column name in *features* from *df1*.

    Note: redefines the earlier helper with a single-axes layout.
    NOTE(review): called below with the bare string 'U'; that only works
    because iterating a one-character string yields the column name itself.
    """
    i = 0
    sns.set_style('whitegrid')
    plt.figure()
    fig, ax = plt.subplots(1, 1, figsize=(12, 10))
    for feature in features:
        i += 1
        plt.subplot(1, 1, i)
        # BUG FIX: the original read the global `df`, silently ignoring the
        # `df1` argument; use the parameter that was passed in.
        sns.distplot(df1[feature], color="orange", kde=True, bins=60, label='data')
        plt.xlabel(feature, fontsize=9); plt.legend()
    plt.show()
# Distribution plot and summary statistics for feature U.
plot_feature_distribution(df,'U')
print("Mean value of Column U: ",df['U'].mean(), "\n", "STD value of Column U: ",df['U'].std())
df['U'].describe()
Techniques such as MinMaxScaler, StandardScaler, Box-Cox, and log scaling can be used.
# Linear association between D and H (both numeric codes at this point).
df['D'].corr(df['H'])
np.corrcoef(df['D'], df['H'] )
A value of 0.06 indicates a weak positive linear relationship between the two variables. We can also confirm the relationship with a linear-regression correlation test.
# from scipy.stats import linregress
# linregress(df['H'],df['D'])
# Finding distribution of values for each cat value
# It seems that for the less amount of D column, more values were with Mortgage
# Contingency table of D vs H, followed by a chi-squared independence test.
pd.crosstab(df['D'], df['H'])
from scipy.stats import chi2_contingency
chi2_contingency(pd.crosstab(df['D'], df['H']))
As seen, the p-value of the chi-squared test is less than 0.05 (1.768878637916804e-228), which means we reject the null hypothesis that column D is not associated with column H. It seems that for smaller values of column D, more rows had a Mortgage.
For feature selection, several kinds of methods exist: filter, wrapper, and embedded methods. Simply using the correlation of variables or defining a variance threshold can also help to find the most heavily weighted variables contributing to the model.
I'll show one example of each type:
# Filter method 1: drop zero-variance (constant) features.
from sklearn.feature_selection import VarianceThreshold
thres = VarianceThreshold(threshold=0)
thres.fit(X)
thres.get_support()   # boolean mask of retained features
# Filter method 2: mutual information between each feature and the target.
from sklearn.feature_selection import mutual_info_classif
plt.figure(figsize=(12, 8))
importance = mutual_info_classif(X, y)
feature_importance = pd.Series(importance, df.columns[0: len(df.columns)-1])
feature_importance.plot(kind='barh')
Exhaustive feature selection and Lasso-based selection can also be used.
# (Repeated) bar chart of feature importances, highest first.
tmp = pd.DataFrame({'Feature': features, 'Feature importance': imp['Importance']})
tmp = tmp.sort_values(by='Feature importance',ascending=False)
plt.figure(figsize = (7,4))
plt.title('Features importance',fontsize=14)
s = sns.barplot(x='Feature',y='Feature importance',data=tmp)
s.set_xticklabels(s.get_xticklabels(),rotation=90)
plt.show()
If the prediction set will also have feature X, then keeping X in the training set wouldn't be a problem in terms of data leakage. Otherwise, it would obviously be problematic to rely on this model's performance. In the first case, I would probably go for stratified k-fold cross-validation with a pipeline build.